Esta primera práctica se realizará sobre los datos de alojamientos de Ámsterdam obtenidos de la página de datos de Airbnb (Inside Airbnb).
Run in Google Colab
|
!pip install -q https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import pandas_profiling
# Gzip-compressed listings CSV from the data/ directory of the Amsterdam
# snapshot dated 2019-07-08.
url = "http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2019-07-08/data/listings.csv.gz"
# Uncompressed listings CSV from the visualisations/ directory of the same
# snapshot; it is merged with the file above on "id" further down.
url1 = "http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2019-07-08/visualisations/listings.csv"
def download_data(url, filename="listings.csv.gz", timeout=60):
    """Download ``url`` and save the response body to ``filename``.

    Args:
        url: Address of the file to fetch.
        filename: Local path the raw bytes are written to (overwritten if
            it already exists).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    r = requests.get(url, timeout=timeout)
    # Fail here rather than saving an HTML error page under a .csv name,
    # which would only surface later as a confusing parser error.
    r.raise_for_status()
    with open(filename, "wb") as f:
        f.write(r.content)
# Fetch both listing files into the working directory.
download_data(url)
download_data(url1, "listings_1.csv")

# Columns kept from the compressed listings file.
columns = ['id', 'room_type', 'bathrooms', 'bedrooms', 'bed_type', 'price',
           'weekly_price', 'monthly_price', 'security_deposit', 'city',
           'state', 'country', 'first_review', 'last_review',
           'review_scores_rating', 'number_of_reviews', 'has_availability',
           'availability_30', 'availability_60', 'availability_90',
           'availability_365']

# usecols= makes the parser skip every column that is not needed instead of
# loading the whole file and discarding most of it afterwards. The trailing
# [columns] keeps the column order of the list above, because usecols alone
# returns columns in file order.
listings = pd.read_csv(
    "listings.csv.gz", usecols=columns, low_memory=False
)[columns]
listings_1 = pd.read_csv("listings_1.csv", low_memory=False)

# Join on "id" (pd.merge defaults to an inner join). Column names present in
# both frames get the default "_x" (listings) / "_y" (listings_1) suffixes.
complete_data = pd.merge(listings, listings_1, on="id")
complete_data.columns
Index(['id', 'room_type_x', 'bathrooms', 'bedrooms', 'bed_type', 'price_x',
'city', 'state', 'review_scores_rating', 'number_of_reviews_x',
'availability_30', 'availability_60', 'availability_90',
'availability_365_x', 'host_id', 'neighbourhood', 'reviews_per_month'],
dtype='object')
# Columns discarded after the merge: the "_y" duplicates coming from the
# second file, plus fields that are not used in the rest of the analysis.
columns_to_remove = [
    "availability_365_y",
    "host_name",
    "neighbourhood_group",
    "number_of_reviews_y",
    "last_review_y",
    "room_type_y",
    "price_y",
    "name",
    "calculated_host_listings_count",
    "latitude",
    "longitude",
    "weekly_price",
    "monthly_price",
    "security_deposit",
    "first_review",
    "last_review_x",
    "has_availability",
    "country",
    "minimum_nights",
]
# columns= already targets the column axis, so no axis argument is needed.
complete_data.drop(columns=columns_to_remove, inplace=True)
complete_data.columns
# Remove exact duplicate rows, then every row with at least one missing value.
complete_data.drop_duplicates(inplace=True)
complete_data.dropna(inplace=True)

# Convert price strings such as "$1,200.00" to floats.
# The previous version replaced the thousands separator "," with ".", so
# "$1,200.00" was parsed as 1.2, and it relied on np.float, which no longer
# exists in current NumPy releases. Stripping "$" and "," and casting the
# whole column is correct for both "$80.00" and "$1,200.00"; any cents are
# kept rather than truncated.
complete_data["price_x"] = (
    complete_data["price_x"]
    .str.replace(r"[$,]", "", regex=True)
    .astype(float)
)

# Build the profiling report for the cleaned frame and show it inline.
profile = pandas_profiling.ProfileReport(
    complete_data,
    title="Pandas Profiling Report",
    html={"style": {"full_width": True}},
)
profile.to_notebook_iframe()